Preparation of the data

# Load required packages: xlsx for Excel I/O, forecast for the time-series
# models, ggplot2 for plotting.
library(xlsx)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(ggplot2)
# Read the first sheet of the dataset (absolute path — adjust per machine)
# and give the columns explicit names.
data=read.xlsx('/Users/hani/Documents/Elec-forecast/Elec-train.xlsx', sheetIndex=1,header=TRUE)
colnames(data) <- c("Time", "Power", "Temp")

Modification of the type of time data from character to POSIXct:

# Parse the character timestamps ("m/d/Y H:M") as POSIXct in GMT.
data$Time = as.POSIXct(data$Time, tz="GMT", format = "%m/%d/%Y %H:%M")

Plot of the time series Power Consumption and Temperature:

# Quick line plots of the raw power consumption and temperature over time.
plot(data$Time,data$Power, type='l')

plot(data$Time, data$Temp,type='l')

Creation of time series objects:

# Build ts objects with 96 observations per day (15-minute steps).
# The series runs from the 6th slot of day 1 to the end of day 47;
# the final day (temperature only, power to be forecast) is excluded here.
power = ts(data$Power, start= c(1,6), end=c(47,96), freq=96)
temperature = ts(data$Temp, start=c(1,6), end=c(47,96),freq=96)

Analysis of the time series Power Consumption

Creation of train and test dataset:

# Train = days 1 to 40, test = days 41 to 47 (7 days = 672 observations).
power_train =window(power,start=c(1,6), end=c(40,96))
power_test = window(power, start=c(41,1), end=c(47,96))
temp_train =window(temperature,start=c(1,6), end=c(40,96))
temp_test = window(temperature, start=c(41,1), end=c(47,96))

Plot of the power time series:

# Training series (solid) with the test period overlaid (dashed).
plot(power_train,xlim=c(1,47))
lines(power_test,lty=2)

The repeating pattern of the time series suggests that there is a periodicity.

More details with the decomposition plot:

# Classical decomposition (trend / seasonal / remainder) of the full series,
# then series + ACF/PACF display of the training set.
autoplot(decompose(power))

ggtsdisplay(power_train)

The seasonality is confirmed by the shape of the ACF plot and the decomposition plot. The period is 96, which corresponds to the total number of observations per day. Furthermore, the seasonality seems to be additive, as we do not see a significant evolution of the variance on the plot.

Forecasting without covariates

Exponential smoothing model

We start by considering an exponential smoothing model.

# Additive Holt-Winters exponential smoothing on the training set,
# then forecast the 7 test days (7 x 96 = 672 steps ahead).
fit_hw = HoltWinters(power_train,seasonal='additive')
prev_hw= forecast(fit_hw, h=672)
plot(power_train,xlim=c(1,47))
lines(power_test,lty=2)
lines(prev_hw$mean,col=2)

RMSE for Exponential smoothing:

# Out-of-sample RMSE of the Holt-Winters forecast against the test set.
rmse_hw=sqrt(mean((prev_hw$mean-power_test)^2))
rmse_hw
## [1] 20.19866

Auto SARIMA model without covariates

# Let auto.arima search for a seasonal ARIMA on the training series,
# then forecast the 672 test steps and overlay on the train/test plot.
fit=auto.arima(power_train)
prev=forecast(fit, h=672)
plot(power_train,xlim=c(1,47))
lines(power_test,lty=2)
lines(prev$mean,col=2)

fit
## Series: power_train 
## ARIMA(1,0,0)(0,1,0)[96] 
## 
## Coefficients:
##          ar1
##       0.7764
## s.e.  0.0103
## 
## sigma^2 estimated as 100.1:  log likelihood=-13917.31
## AIC=27838.61   AICc=27838.62   BIC=27851.07

The model generated is a SARIMA (1,0,0)(0,1,0)[96].

RMSE SARIMA without covariate:

# RMSE of the automatic SARIMA forecast on the test set.
rmse_sa = sqrt(mean((prev$mean-power_test)^2))
rmse_sa
## [1] 15.37603

Residuals checking on the SARIMA model automatically generated:

# Residual diagnostics: the Ljung-Box test strongly rejects white noise
# (p < 2.2e-16), so autocorrelation remains in the residuals.
checkresiduals(fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(0,1,0)[96]
## Q* = 1495.9, df = 191, p-value < 2.2e-16
## 
## Model df: 1.   Total lags used: 192
# Partial ACF of the residuals, to guide a manual model specification.
ggPacf(fit$residuals)

There are still some autocorrelations in the residuals, which suggests that they are not fully modeled.

Manual SARIMA without covariate

# Seasonal difference at lag 96 to remove the daily periodicity,
# then inspect the ACF/PACF of the differenced series.
tmp=diff(power_train,lag=96)
ggAcf(tmp)

ggtsdisplay(tmp)

Looking at the slow decay of the ACF, it seems that there is still a trend.

# First difference on top of the seasonal one, to remove the remaining trend.
ggtsdisplay(diff(tmp))

We observe a very significant spike on lag 96 on acf with an exponential decay on pacf seasonal lags. This suggests a seasonal MA1. And we observe another significant spike on lag 4 on acf for which we will choose a non-seasonal MA4.

Following this, we can think of a SARIMA(0,1,4)(0,1,1)[96]

# NOTE(review): stationary=TRUE forbids any differencing, so auto.arima
# cannot take the seasonal difference here and falls back to an ARMA(5,1)
# with mean — kept for comparison only; confirm this restriction is intended.
fitx=auto.arima(power_train,stationary=TRUE, seasonal=TRUE)
fitx
## Series: power_train 
## ARIMA(5,0,1) with non-zero mean 
## 
## Coefficients:
##          ar1      ar2      ar3     ar4      ar5      ma1      mean
##       1.8800  -0.7717  -0.1742  0.0903  -0.0298  -0.9753  231.7521
## s.e.  0.0164   0.0345   0.0366  0.0346   0.0164   0.0032    1.0784
## 
## sigma^2 estimated as 205.8:  log likelihood=-15654.39
## AIC=31324.79   AICc=31324.83   BIC=31374.8
# Manually specified SARIMA(0,1,4)(0,1,1)[96], as suggested by the
# ACF/PACF analysis of the differenced series above.
fitsam=arima(power_train,order=c(0,1,4),seasonal=list(order=c(0,1,1), period = 96))
fitsam
## 
## Call:
## arima(x = power_train, order = c(0, 1, 4), seasonal = list(order = c(0, 1, 1), 
##     period = 96))
## 
## Coefficients:
##           ma1      ma2      ma3      ma4     sma1
##       -0.2481  -0.0974  -0.1248  -0.3480  -0.8519
## s.e.   0.0161   0.0164   0.0184   0.0169   0.0094
## 
## sigma^2 estimated as 54.44:  log likelihood = -12837.35,  aic = 25686.7
# Residual check: Ljung-Box still rejects white noise (p = 2.34e-10), but
# the fit improves markedly on the automatic model (lower aic, smaller sigma^2).
checkresiduals(fitsam)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,4)(0,1,1)[96]
## Q* = 333.7, df = 187, p-value = 2.34e-10
## 
## Model df: 5.   Total lags used: 192
# Forecast the 672 test steps and overlay on the train/test plot.
prev_sa_m=forecast(fitsam, h=672)
plot(power_train,xlim=c(1,47))
lines(power_test,lty=2)
lines(prev_sa_m$mean,col=2)

RMSE Manual SARIMA without covariate:

# RMSE of the manual SARIMA forecast on the test set.
rmse_sa_m= sqrt(mean((prev_sa_m$mean-power_test)^2))
rmse_sa_m
## [1] 14.74924

Neural Network Autoregressive model

# Neural network autoregression. NOTE(review): `T` is not a formal
# argument of nnetar() (its formals include p, P, size, repeats, xreg);
# `T=96` is absorbed by `...` and the seasonal period is actually taken
# from the ts frequency — confirm this argument is intentional.
fitnn=nnetar(power_train, T=96)
print(fitnn)
## Series: power_train 
## Model:  NNAR(20,1,11)[96] 
## Call:   nnetar(y = power_train, T = 96)
## 
## Average of 20 networks, each of which is
## a 21-11-1 network with 254 weights
## options were - linear output units 
## 
## sigma^2 estimated as 47.82
# Forecast the test horizon and compare against the auto SARIMA forecast.
prevNN = forecast(fitnn, h=672)
autoplot(power_test)+
  autolayer(prev$mean,series="Auto SARIMA without covariate")+
  autolayer(prevNN$mean,series="NNAR")

RMSE NNAR:

# RMSE of the NNAR forecast — far worse than the SARIMA models here.
rmse_NN=sqrt(mean((prevNN$mean-power_test)^2))
rmse_NN
## [1] 63.31427

Forecasting with covariate

Auto SARIMA model with covariate

We introduce the temperature as a covariate for the definition of the SARIMA model for Power.

# Auto SARIMA with the outdoor temperature as an external regressor.
fit_cov=auto.arima(power_train,xreg=temp_train)
# Forecasting requires the future regressor values: the test temperatures.
prev_power_cov = forecast(fit_cov,h=672,xreg=temp_test)
autoplot(power_test) + 
  autolayer(prev_power_cov$mean, series = 'SARIMA with covariate')+
  autolayer(prev$mean,series="Auto SARIMA without covariate")+
  autolayer(prevNN$mean,series="NNAR")

fit_cov
## Series: power_train 
## Regression with ARIMA(1,0,0)(0,1,0)[96] errors 
## 
## Coefficients:
##          ar1    xreg
##       0.7744  0.3238
## s.e.  0.0104  0.2441
## 
## sigma^2 estimated as 100.1:  log likelihood=-13916.47
## AIC=27838.94   AICc=27838.94   BIC=27857.62

RMSE Auto SARIMA with covariate:

# RMSE of the covariate model on the test set, then residual diagnostics.
rmse_sa_co=sqrt(mean((prev_power_cov$mean-power_test)^2))
rmse_sa_co
## [1] 15.11528
checkresiduals(fit_cov)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,0,0)(0,1,0)[96] errors
## Q* = 1498.5, df = 190, p-value < 2.2e-16
## 
## Model df: 2.   Total lags used: 192

There are still some auto-correlations in the model residuals, which suggests that the residuals are not fully modeled.

We can try to find a better model manually.

Manual SARIMA with covariate

We start by looking at the relationship between Power and Temperature.

# Linear model of power on temperature, a linear trend and the 96 seasonal
# dummies, to assess the Power/Temperature relationship.
power_train2 = cbind(Production=power_train,Temp=temp_train)
model=tslm(Production~Temp+trend+season,data=power_train2)
summary(model)
## 
## Call:
## tslm(formula = Production ~ Temp + trend + season, data = power_train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -112.381   -4.565    0.205    4.631   59.010 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.562e+02  2.100e+00  74.398  < 2e-16 ***
## Temp         1.158e+00  9.775e-02  11.849  < 2e-16 ***
## trend       -4.875e-03  1.741e-04 -28.002  < 2e-16 ***
## season2     -6.668e-01  2.641e+00  -0.252  0.80070    
## season3     -6.834e+00  2.641e+00  -2.587  0.00971 ** 
## season4     -3.391e-01  2.641e+00  -0.128  0.89785    
## season5      2.848e+00  2.641e+00   1.078  0.28101    
## season6      3.093e+00  2.625e+00   1.178  0.23869    
## season7     -6.504e+00  2.625e+00  -2.478  0.01327 *  
## season8     -6.572e+00  2.625e+00  -2.503  0.01234 *  
## season9     -3.132e+00  2.625e+00  -1.193  0.23291    
## season10    -4.103e+00  2.625e+00  -1.563  0.11814    
## season11    -1.361e+00  2.625e+00  -0.518  0.60421    
## season12    -2.156e+00  2.625e+00  -0.821  0.41155    
## season13    -4.712e-01  2.625e+00  -0.179  0.85757    
## season14    -5.117e+00  2.626e+00  -1.948  0.05144 .  
## season15    -6.554e+00  2.626e+00  -2.496  0.01261 *  
## season16    -8.845e-01  2.626e+00  -0.337  0.73627    
## season17     5.479e-01  2.626e+00   0.209  0.83475    
## season18    -3.644e-01  2.627e+00  -0.139  0.88969    
## season19    -1.300e+00  2.627e+00  -0.495  0.62088    
## season20     4.478e-01  2.627e+00   0.170  0.86466    
## season21    -4.230e-02  2.627e+00  -0.016  0.98715    
## season22     9.778e-01  2.628e+00   0.372  0.70988    
## season23     2.015e+00  2.628e+00   0.767  0.44328    
## season24     4.280e+00  2.628e+00   1.629  0.10350    
## season25     3.832e+00  2.628e+00   1.458  0.14487    
## season26     6.949e+00  2.628e+00   2.644  0.00823 ** 
## season27     1.378e+01  2.628e+00   5.244 1.66e-07 ***
## season28     1.581e+01  2.628e+00   6.016 1.96e-09 ***
## season29     1.598e+01  2.628e+00   6.081 1.31e-09 ***
## season30     2.186e+01  2.628e+00   8.318  < 2e-16 ***
## season31     2.165e+01  2.628e+00   8.236 2.43e-16 ***
## season32     1.624e+01  2.628e+00   6.179 7.16e-10 ***
## season33     1.991e+01  2.628e+00   7.577 4.44e-14 ***
## season34     1.040e+02  2.628e+00  39.580  < 2e-16 ***
## season35     1.017e+02  2.628e+00  38.721  < 2e-16 ***
## season36     9.907e+01  2.628e+00  37.700  < 2e-16 ***
## season37     9.873e+01  2.628e+00  37.572  < 2e-16 ***
## season38     1.014e+02  2.624e+00  38.633  < 2e-16 ***
## season39     9.647e+01  2.624e+00  36.758  < 2e-16 ***
## season40     9.893e+01  2.624e+00  37.695  < 2e-16 ***
## season41     9.960e+01  2.624e+00  37.951  < 2e-16 ***
## season42     9.584e+01  2.626e+00  36.504  < 2e-16 ***
## season43     9.657e+01  2.626e+00  36.781  < 2e-16 ***
## season44     9.821e+01  2.626e+00  37.407  < 2e-16 ***
## season45     9.858e+01  2.626e+00  37.546  < 2e-16 ***
## season46     1.005e+02  2.630e+00  38.219  < 2e-16 ***
## season47     9.857e+01  2.630e+00  37.478  < 2e-16 ***
## season48     9.924e+01  2.630e+00  37.734  < 2e-16 ***
## season49     1.000e+02  2.630e+00  38.037  < 2e-16 ***
## season50     9.941e+01  2.637e+00  37.695  < 2e-16 ***
## season51     1.028e+02  2.637e+00  38.962  < 2e-16 ***
## season52     1.019e+02  2.637e+00  38.647  < 2e-16 ***
## season53     1.005e+02  2.637e+00  38.098  < 2e-16 ***
## season54     1.020e+02  2.642e+00  38.597  < 2e-16 ***
## season55     1.023e+02  2.642e+00  38.724  < 2e-16 ***
## season56     1.015e+02  2.642e+00  38.419  < 2e-16 ***
## season57     1.009e+02  2.642e+00  38.200  < 2e-16 ***
## season58     1.010e+02  2.648e+00  38.149  < 2e-16 ***
## season59     1.013e+02  2.648e+00  38.274  < 2e-16 ***
## season60     1.010e+02  2.648e+00  38.132  < 2e-16 ***
## season61     1.008e+02  2.648e+00  38.062  < 2e-16 ***
## season62     1.005e+02  2.649e+00  37.955  < 2e-16 ***
## season63     1.002e+02  2.649e+00  37.814  < 2e-16 ***
## season64     1.002e+02  2.649e+00  37.821  < 2e-16 ***
## season65     1.001e+02  2.649e+00  37.790  < 2e-16 ***
## season66     1.005e+02  2.644e+00  38.031  < 2e-16 ***
## season67     1.010e+02  2.644e+00  38.201  < 2e-16 ***
## season68     9.845e+01  2.644e+00  37.239  < 2e-16 ***
## season69     9.845e+01  2.644e+00  37.237  < 2e-16 ***
## season70     1.198e+02  2.636e+00  45.456  < 2e-16 ***
## season71     1.337e+02  2.636e+00  50.734  < 2e-16 ***
## season72     1.450e+02  2.636e+00  54.998  < 2e-16 ***
## season73     1.443e+02  2.636e+00  54.755  < 2e-16 ***
## season74     1.420e+02  2.629e+00  54.012  < 2e-16 ***
## season75     1.407e+02  2.629e+00  53.500  < 2e-16 ***
## season76     1.398e+02  2.629e+00  53.166  < 2e-16 ***
## season77     1.402e+02  2.629e+00  53.316  < 2e-16 ***
## season78     1.459e+02  2.627e+00  55.547  < 2e-16 ***
## season79     1.419e+02  2.627e+00  54.022  < 2e-16 ***
## season80     1.406e+02  2.627e+00  53.511  < 2e-16 ***
## season81     1.391e+02  2.627e+00  52.926  < 2e-16 ***
## season82     1.391e+02  2.626e+00  52.981  < 2e-16 ***
## season83     1.368e+02  2.626e+00  52.100  < 2e-16 ***
## season84     1.367e+02  2.626e+00  52.052  < 2e-16 ***
## season85     1.357e+02  2.626e+00  51.657  < 2e-16 ***
## season86     1.337e+02  2.625e+00  50.938  < 2e-16 ***
## season87     1.326e+02  2.625e+00  50.525  < 2e-16 ***
## season88     1.315e+02  2.625e+00  50.087  < 2e-16 ***
## season89     1.293e+02  2.625e+00  49.243  < 2e-16 ***
## season90     1.123e+02  2.624e+00  42.776  < 2e-16 ***
## season91     1.109e+02  2.624e+00  42.267  < 2e-16 ***
## season92     1.048e+02  2.624e+00  39.949  < 2e-16 ***
## season93     1.060e+02  2.624e+00  40.374  < 2e-16 ***
## season94     3.007e+01  2.624e+00  11.457  < 2e-16 ***
## season95     3.221e+01  2.624e+00  12.274  < 2e-16 ***
## season96     3.037e+00  2.624e+00   1.157  0.24731    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.66 on 3737 degrees of freedom
## Multiple R-squared:  0.9597, Adjusted R-squared:  0.9586 
## F-statistic: 917.3 on 97 and 3737 DF,  p-value: < 2.2e-16

The temperature, the trend and most of the seasonal dummies are significant.

We check the residuals of the model:

# ACF/PACF of the regression residuals, to choose an ARMA error structure.
ggtsdisplay(model$residuals)

There is an exponential decrease of the ACF and significant spike at lag 5 on the PACF. This looks like an AR5 model.

We can test the model:

# Fit an AR(5) to the regression residuals, as suggested by the PACF spike
# at lag 5, and check what autocorrelation remains.
tmp=model$residuals
fit3 = Arima(tmp,order=c(5,0,0))
checkresiduals(fit3)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,0) with non-zero mean
## Q* = 348.19, df = 186, p-value = 5.885e-12
## 
## Model df: 6.   Total lags used: 192
ggtsdisplay(fit3$residuals)

The residuals still do not look entirely like white noise. However, looking at the significance level of the spikes on the ACF/PACF plots, and given that trials with other orders ended up with more important auto-correlations, we continue with this model for the residuals.

Back to the entire model:

# Dynamic regression: AR(5) errors with a seasonal difference, temperature
# as the external regressor.
fit_sacom = Arima(power_train2[,"Production"],xreg=power_train2[,"Temp"],order=c(5,0,0),seasonal = c(0,1,0))
ggtsdisplay(fit_sacom$residuals)

There are still some significant auto-correlation: a spike at lag 96 on the ACF and an exponential decrease on the PACF plot on seasonal lags. We will model it with a seasonal MA1.

# Add a seasonal MA(1) to absorb the remaining spike at lag 96 in the ACF.
fit_sa_co_m = Arima(power_train2[,"Production"],xreg=power_train2[,"Temp"],order=c(5,0,0),seasonal = c(0,1,1))
ggtsdisplay(fit_sa_co_m$residuals)

The auto-correlations are less significant with this model. We will use this model for forecasting.

# Forecast the test horizon using the observed test temperatures as regressor.
prev_sa_co_m = forecast(fit_sa_co_m,h=672,xreg=temp_test)
autoplot(power_test)+autolayer(prev_sa_co_m$mean, series="Manual SARIMA with covariate")

RMSE Manual SARIMA with covariate:

# RMSE of the manual SARIMA with covariate — the best score of all models.
rmse_sa_co_m = sqrt(mean((prev_sa_co_m$mean-power_test)^2))
rmse_sa_co_m
## [1] 14.61462

Vectorial Auto-Regressive model

library(vars)
## Loading required package: MASS
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: urca
## Loading required package: lmtest
# NOTE(review): dataVar binds the FULL power and temperature series
# (train + test), so the lag selection below sees the test period —
# consider selecting on the training window only to avoid leakage.
dataVar=cbind(power,temperature)
VARselect(dataVar,lag.max=96, type='const')$selection
## AIC(n)  HQ(n)  SC(n) FPE(n) 
##     96     96     96     96

The best VAR lag order found is 96, according to all the selection criteria.

We use it for the construction of our VAR model:

# NOTE(review): `var` is fit on the full series and never used afterwards
# (it also masks base::var); the evaluation below uses var2, fit on the
# training data only. Consider removing this line.
var <- VAR(dataVar, p=96,type='const')
datatrain=cbind(power_train,temp_train)

# VAR(96) on the training data; forecast the 7 test days (672 steps).
var2 = VAR(datatrain, p=96)
fcst2=forecast(var2, h=672)
plot(fcst2)

RMSE for the VAR power:

# RMSE of the VAR power forecast on the test set.
rmse_var_p=sqrt(mean((power_test-fcst2$forecast$power_train$mean)^2))
rmse_var_p
## [1] 19.37745

Summary of all models forecasting compared to test set

# Overlay every model's test-horizon forecast on the observed test series.
autoplot(power_test)+
  autolayer(prev_hw$mean,series='HW without covariate')+
  autolayer(prev$mean,series="Auto SARIMA without covariate")+
  autolayer(prev_sa_m$mean,series="Manual SARIMA without covariate")+
  autolayer(prev_power_cov$mean, series='Auto SARIMA with covariate')+
  autolayer(prevNN$mean,series="NNAR")+
  autolayer(fcst2$forecast$power_train$mean,series='VAR')

# Tabulate the out-of-sample RMSE of every model for comparison.
Models <- c('HW', 'Auto SARIMA w/o covariate', 'Manual SARIMA w/o covariate', 'NNAR', 'Auto SARIMA w/ covariate', 'Manual SARIMA w/ covariate', 'VAR')
RMSE <- c(rmse_hw,rmse_sa, rmse_sa_m, rmse_NN, rmse_sa_co, rmse_sa_co_m,rmse_var_p)
error.data <- data.frame(Models,RMSE)
print(error.data)
##                        Models     RMSE
## 1                          HW 20.19866
## 2   Auto SARIMA w/o covariate 15.37603
## 3 Manual SARIMA w/o covariate 14.74924
## 4                        NNAR 63.31427
## 5    Auto SARIMA w/ covariate 15.11528
## 6  Manual SARIMA w/ covariate 14.61462
## 7                         VAR 19.37745

In regard to the values obtained for RMSE, the best model using outdoor temperature for forecasting is the manual SARIMA with covariate (SARIMA(5,0,0)(0,1,1)[96]) and the one not using outdoor temperature is the manual SARIMA without covariate (SARIMA(0,1,4)(0,1,1)[96]).

Forecasting of the power consumption on the 2/17/2010

Without using the outdoor temperature

# Final forecast without covariate: refit the best model found,
# SARIMA(0,1,4)(0,1,1)[96], on the full 47 days and predict day 48.
final_fit = Arima(power,order=c(0,1,4),seasonal=c(0,1,1))
final_prev=forecast(final_fit,h=96)
autoplot(final_prev)+
  ggtitle('Forcasting of the power consumption of the 17th of Feb 2010')+
  labs(y='Power consumption')

Using outdoor temperature

# Temperature covariate for the day to forecast (2/17/2010).
# BUG FIX: the original ts(data$Temp, start=c(48,1), end=c(48,96), freq=96)
# truncates data$Temp to its FIRST 96 values (ts() fills from the beginning
# of the vector when it is longer than the start/end span), i.e. day 1's
# temperatures mislabelled as day 48. Take the LAST 96 rows instead, which
# hold the outdoor temperatures recorded for the day being forecast.
temp_final=ts(tail(data$Temp, 96), start=c(48,1), freq=96)
# Refit the selected SARIMA(5,0,0)(0,1,1)[96] with temperature regressor on
# the full history, then forecast the next 96 steps with the known temperatures.
final_fit2 = Arima(power,order=c(5,0,0),seasonal=c(0,1,1), xreg=temperature)
final_prev2=forecast(final_fit2,xreg=temp_final,h=96)
autoplot(final_prev2)+
  ggtitle('Forcasting of the power consumption of the 17th of Feb 2010 \n using outdoor Temperature')+
  labs(y='Power consumption')

Concatenation of the two forecasts and export to an xlsx file

# Export both 96-step forecasts (without / with temperature) side by side,
# without headers or row names, to a fresh xlsx file.
forecast_value <- cbind(final_prev$mean, final_prev2$mean)
write.xlsx(forecast_value, file = "CHERID.xlsx",
      sheetName = "Forecast", row.names=FALSE, col.names=FALSE, append = FALSE)